# NOTE(review): setwd() with an absolute personal path is fragile -- prefer
# relative paths or an RStudio project; kept here so the reads still work.
setwd('/Users/mandyhong/Desktop/MATH 422')

# Monthly hate-crime counts (columns: year, month, hate_crime) for three groups.
allHC   <- read.csv('all_race_hc_ts.csv')
blackHC <- read.csv('black_hc_ts.csv')
asianHC <- read.csv('asian_hc_ts.csv')

# Quick look at each raw series.
tsplot(allHC$hate_crime)
tsplot(blackHC$hate_crime)
tsplot(asianHC$hate_crime)

# Shared palette for the overlay plots (defined once; the original assigned
# the identical vector twice).
culer <- c(rgb(.85, .30, .12, .6), rgb(.12, .65, .85, .6), "aquamarine3")

# All three series on one set of axes (raw scale).
tsplot(allHC$hate_crime, col = culer[1], lwd = 2, pch = 20,
       ylim = c(min(asianHC$hate_crime), max(allHC$hate_crime)),
       ylab = "Hate crimes", main = "Racial Hate Crimes")
lines(blackHC$hate_crime, col = culer[2], lwd = 2, pch = 20)
lines(asianHC$hate_crime, col = culer[3], lwd = 2, pch = 20)
legend("topleft", col = culer, lty = 1, lwd = 2, pch = 20,
       legend = c("All race", "African American/Black", "Asian"), bg = "white")

# Same overlay on the log scale to compare relative (multiplicative) changes.
tsplot(log(allHC$hate_crime), col = culer[1], lwd = 2, pch = 20,
       ylim = c(min(log(asianHC$hate_crime)), max(log(allHC$hate_crime) + 1)),
       ylab = "log(Hate crimes)", main = "Racial Hate Crimes")
lines(log(blackHC$hate_crime), col = culer[2], lwd = 2, pch = 20)
lines(log(asianHC$hate_crime), col = culer[3], lwd = 2, pch = 20)
legend("topleft", col = culer, lty = 1, lwd = 2, pch = 20,
       legend = c("All race", "African American/Black", "Asian"), bg = "white")

summary(allHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 198.0
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.: 319.0
## Median :2006 Median : 6.50 Median : 379.0
## Mean :2006 Mean : 6.50 Mean : 389.1
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.: 448.0
## Max. :2020 Max. :12.00 Max. :1329.0
# Five-number summary for the Black hate-crime series.
summary(blackHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 88.0
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:167.0
## Median :2006 Median : 6.50 Median :202.0
## Mean :2006 Mean : 6.50 Mean :207.7
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:242.5
## Max. :2020 Max. :12.00 Max. :693.0
# Five-number summary for the Asian hate-crime series.
summary(asianHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 3.00
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:12.00
## Median :2006 Median : 6.50 Median :17.00
## Mean :2006 Mean : 6.50 Mean :17.84
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:23.00
## Max. :2020 Max. :12.00 Max. :51.00
# First differences of the all-race series, padded with a leading 0 so the
# column aligns with the original 360 rows.
allHC$difference <- c(0, diff(allHC$hate_crime))

# Tukey fences. FIX: the original computed IQR on diff() (359 values) but the
# quartiles on the 0-padded column (360 values); both now use the same vector.
d    <- diff(allHC$hate_crime)
iqr  <- IQR(d)
Q    <- quantile(d, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr
low  <- Q[1] - 1.5 * iqr

tsplot(allHC$difference, main = "Detecting Outliers Using IQR Score")
abline(h = high, lty = 2, col = 'red')  # upper fence
abline(h = low,  lty = 2, col = 'red')  # lower fence
# First differences of the Black series, padded to match the 360 rows.
blackHC$difference <- c(0, diff(blackHC$hate_crime))

# Tukey fences. FIX: IQR and quartiles are now computed on the same vector
# (raw differences, without the artificial leading 0).
d   <- diff(blackHC$hate_crime)
iqr <- IQR(d)
Q   <- quantile(d, probs = c(.25, .75), na.rm = FALSE)
#Qtest<-quantile(blackHC$difference)
#Qtest[4] #3rd quantile
high <- Q[2] + 1.5 * iqr
low  <- Q[1] - 1.5 * iqr

tsplot(blackHC$difference, main = "Detecting Outliers Using IQR Score")
abline(h = high, lty = 2, col = 'red')
abline(h = low,  lty = 2, col = 'red')
#blackHC[blackHC$difference > high, ]
#blackHC[blackHC$difference < low, ]
#outliers=c(which(blackHC$difference > high), which(blackHC$difference < low)) #12 outliers
#blackHC_no_outliers=blackHC[-outliers,]
#tsplot(blackHC_no_outliers$difference) #without outliers the series looks stationary. In this case can we exclude outliers?
#tsplot(diff(blackHC_no_outliers$hate_crime)) #why does this plot differ from the one above?
#check for the outliers
# First differences of the Asian series, padded to match the 360 rows.
asianHC$difference <- c(0, diff(asianHC$hate_crime))

# Tukey fences. FIX: IQR and quartiles are now computed on the same vector
# (raw differences, without the artificial leading 0).
d   <- diff(asianHC$hate_crime)
iqr <- IQR(d)
Q   <- quantile(d, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr
low  <- Q[1] - 1.5 * iqr

tsplot(asianHC$difference, main = "Detecting Outliers Using IQR Score")
abline(h = high, lty = 2, col = 'red')
abline(h = low,  lty = 2, col = 'red')
#asianHC[asianHC$difference > high, ]
#asianHC[asianHC$difference < low, ]
#outliers=c(which(asianHC$difference > high), which(asianHC$difference < low)) #8 outliers
#asianHC_no_outliers=asianHC[-outliers,]
# Date-indexed versions of the all-race series for calendar-aware plotting.
ts1 <- xts(allHC$hate_crime, as.POSIXct(sprintf("%d-%d-01", allHC$year, allHC$month)))
ts2 <- xts(allHC$hate_crime, as.yearmon(allHC$year + (allHC$month - 1) / 12))
#plot(ts1)
plot(ts2, main = "All Racial Hate Crimes")
#balckHC=as.ts(blackHC)
#asianHC=as.ts(asianHC)

# Start with a linear time-trend model. FIX: derive the time index from the
# data instead of hard-coding 360. (Note: `t` masks base::t from here on.)
t <- seq_len(nrow(allHC))
mod1HC <- lm(hate_crime ~ t, data = allHC)
summary(mod1HC) #Adjusted R-squared: 0.06522
##
## Call:
## lm(formula = hate_crime ~ t, data = allHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -241.68 -63.07 -7.12 50.68 925.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 439.96051 11.50834 38.230 < 2e-16 ***
## t -0.28199 0.05525 -5.104 5.42e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109 on 358 degrees of freedom
## Multiple R-squared: 0.06782, Adjusted R-squared: 0.06522
## F-statistic: 26.05 on 1 and 358 DF, p-value: 5.424e-07
# Residual diagnostics for the linear-trend model.
tsplot(mod1HC$residuals)         # pattern barely changed from the raw series
plot(mod1HC$residuals)           # not stationary; try a quadratic trend next
lines(lowess(mod1HC$residuals))

# Quadratic trend model.
mod2HC <- lm(hate_crime ~ t + I(t^2), data = allHC)
summary(mod2HC) # Adjusted R-squared: 0.06488 -- no improvement
##
## Call:
## lm(formula = hate_crime ~ t + I(t^2), data = allHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -229.80 -61.02 -8.74 48.63 920.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.279e+02 1.733e+01 24.696 <2e-16 ***
## t -8.180e-02 2.216e-01 -0.369 0.712
## I(t^2) -5.545e-04 5.946e-04 -0.933 0.352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109 on 357 degrees of freedom
## Multiple R-squared: 0.07009, Adjusted R-squared: 0.06488
## F-statistic: 13.45 on 2 and 357 DF, p-value: 2.329e-06
# Residual diagnostics for the quadratic model.
tsplot(mod2HC$residuals)         # little change from the raw series
plot(mod2HC$residuals)           # still non-stationary; clear seasonality
lines(lowess(mod2HC$residuals))

# Add harmonic (cosine/sine) regressors with a 12-month period.
allHC$Xcos <- cos(2 * pi * t / 12)
allHC$Xsin <- sin(2 * pi * t / 12)
mod3HC <- lm(hate_crime ~ t + I(t^2) + Xcos + Xsin, data = allHC)
summary(mod3HC) # Adjusted R-squared: 0.2178 -- improved; t and t^2 not significant
##
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos + Xsin, data = allHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -221.22 -58.64 -7.05 36.36 888.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.285e+02 1.585e+01 27.039 < 2e-16 ***
## t -8.650e-02 2.027e-01 -0.427 0.670
## I(t^2) -5.499e-04 5.438e-04 -1.011 0.313
## Xcos -5.424e+01 7.429e+00 -7.301 1.89e-12 ***
## Xsin -3.193e+01 7.431e+00 -4.297 2.24e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.67 on 355 degrees of freedom
## Multiple R-squared: 0.2265, Adjusted R-squared: 0.2178
## F-statistic: 25.98 on 4 and 355 DF, p-value: < 2.2e-16
# Residual diagnostics for the harmonic model.
tsplot(mod3HC$residuals)         # seasonal pattern removed, still not stationary
plot(mod3HC$residuals)
lines(lowess(mod3HC$residuals))  # better than the two previous models

# Harmonic model with the polynomial trend terms dropped.
mod4HC <- lm(hate_crime ~ Xcos + Xsin, data = allHC)
summary(mod4HC) # Adjusted R-squared: 0.1503 -- worse than mod3HC
##
## Call:
## lm(formula = hate_crime ~ Xcos + Xsin, data = allHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -182.59 -71.94 -9.86 49.94 909.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.061 5.475 71.066 < 2e-16 ***
## Xcos -54.528 7.742 -7.043 9.74e-12 ***
## Xsin -30.868 7.742 -3.987 8.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.9 on 357 degrees of freedom
## Multiple R-squared: 0.155, Adjusted R-squared: 0.1503
## F-statistic: 32.75 on 2 and 357 DF, p-value: 8.745e-14
# Residual diagnostics for the trend-free harmonic model.
tsplot(mod4HC$residuals)         # seasonality removed, still not stationary
plot(mod4HC$residuals)
lines(lowess(mod4HC$residuals))  # better than the first two models

# Seasonal-means model: one level per calendar month plus a linear trend
# (intercept suppressed so every month gets its own coefficient).
mod5HC <- lm(hate_crime ~ t + as.factor(month) + 0, data = allHC)
summary(mod5HC) # Adjusted R-squared: 0.9417 -- best so far (NOTE: R^2 is inflated by the no-intercept parameterization); maybe overfitting?
##
## Call:
## lm(formula = hate_crime ~ t + as.factor(month) + 0, data = allHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -221.73 -53.75 -2.05 37.68 866.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## t -0.28353 0.04964 -5.712 2.4e-08 ***
## as.factor(month)1 376.21794 19.86072 18.943 < 2e-16 ***
## as.factor(month)2 377.16814 19.88248 18.970 < 2e-16 ***
## as.factor(month)3 440.58500 19.90435 22.135 < 2e-16 ***
## as.factor(month)4 446.93520 19.92631 22.429 < 2e-16 ***
## as.factor(month)5 481.78540 19.94837 24.152 < 2e-16 ***
## as.factor(month)6 472.26893 19.97053 23.648 < 2e-16 ***
## as.factor(month)7 485.21913 19.99279 24.270 < 2e-16 ***
## as.factor(month)8 476.20266 20.01514 23.792 < 2e-16 ***
## as.factor(month)9 498.85286 20.03760 24.896 < 2e-16 ***
## as.factor(month)10 484.46972 20.06015 24.151 < 2e-16 ***
## as.factor(month)11 402.81992 20.08280 20.058 < 2e-16 ***
## as.factor(month)12 340.33678 20.10555 16.928 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97.82 on 347 degrees of freedom
## Multiple R-squared: 0.9438, Adjusted R-squared: 0.9417
## F-statistic: 448 on 13 and 347 DF, p-value: < 2.2e-16
# Residual diagnostics and information criteria for the seasonal-means model.
tsplot(mod5HC$residuals) # seasonal trend removed, but still not stationary
plot(mod5HC$residuals)
lines(lowess(mod5HC$residuals)) # better than the first two models, still not flat
AIC(mod5HC) #4336.28
## [1] 4336.28
BIC(mod5HC) #4390.685
## [1] 4390.685
acf(mod5HC$residuals) # strong residual autocorrelation -- model inadequate
# None of the models fixed the non-linearity observed in the residuals, so
# function-of-time models cannot be used for prediction.
# Explore regular vs. seasonal differencing and a log transform.
# FIX: `lag.max` spelled out -- the original relied on partial matching of
# `lag=` in acf(), which is a different argument name than diff()'s `lag`.
tsplot(allHC$hate_crime)
acf(allHC$hate_crime, lag.max = 100)            # suggests seasonal differencing
tsplot(diff(allHC$hate_crime, lag = 12))
acf(diff(allHC$hate_crime, lag = 12))           # seasonal pattern removed
tsplot(diff(allHC$hate_crime))
acf(diff(allHC$hate_crime), lag.max = 100)      # definitely needs seasonal differencing
tsplot(log(allHC$hate_crime))                   # log does not fix the non-constant variance much
acf(log(allHC$hate_crime))                      # still shows seasonal structure
tsplot(diff(log(allHC$hate_crime), lag = 12))
acf(diff(log(allHC$hate_crime), lag = 12))
tsplot(diff(log(allHC$hate_crime)))
acf(diff(log(allHC$hate_crime)), lag.max = 100) # seasonal structure remains
# Seasonal differencing makes sense; still need formal stationarity checks.
adf.test(diff(allHC$hate_crime, lag = 12))
## Warning in adf.test(diff(allHC$hate_crime, lag = 12)): p-value smaller than
## printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(allHC$hate_crime, lag = 12)
## Dickey-Fuller = -4.7544, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test (H0: unit root) on the seasonally differenced series.
pp.test(diff(allHC$hate_crime, lag=12))
## Warning in pp.test(diff(allHC$hate_crime, lag = 12)): p-value smaller than
## printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: diff(allHC$hate_crime, lag = 12)
## Dickey-Fuller Z(alpha) = -190.63, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test (H0: level stationarity) on the seasonally differenced series.
kpss.test(diff(allHC$hate_crime, lag=12))
## Warning in kpss.test(diff(allHC$hate_crime, lag = 12)): p-value greater than
## printed p-value
##
## KPSS Test for Level Stationarity
##
## data: diff(allHC$hate_crime, lag = 12)
## KPSS Level = 0.23177, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS unit-root test; all four tests agree the seasonally differenced
# series is stationary, so proceed with SARIMA fitting.
summary(ur.ers(diff(allHC$hate_crime, lag=12))) #pass all, stationary. We can proceed with fitting SARIMA model
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -841.93 -45.46 -12.03 17.39 802.40
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.07861 0.03157 -2.490 0.01325 *
## yd.diff.lag1 -0.43439 0.05832 -7.448 7.96e-13 ***
## yd.diff.lag2 -0.35763 0.06169 -5.797 1.55e-08 ***
## yd.diff.lag3 -0.16949 0.06018 -2.817 0.00514 **
## yd.diff.lag4 -0.10401 0.05409 -1.923 0.05533 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 94.55 on 338 degrees of freedom
## Multiple R-squared: 0.2332, Adjusted R-squared: 0.2219
## F-statistic: 20.56 on 5 and 338 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -2.4902
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# ACF/PACF of the seasonally differenced series to pick SARIMA orders.
# FIX: `lag.max` named explicitly instead of partial-matched `lag=`.
tsplot(diff(allHC$hate_crime, lag = 12))
acf(diff(allHC$hate_crime, lag = 12), lag.max = 50)  # cuts off at lag 12 -> SMA(1)
pacf(diff(allHC$hate_crime, lag = 12), lag.max = 50) # lags 1, 3, 12, 13, 24 significant -> AR(1)? SAR(2)?
# 1. Try the most obvious candidate: SMA(1), i.e. ARIMA(0,0,0)(0,1,1)[12] with drift.
mod6HC <- Arima(allHC$hate_crime, order = c(0, 0, 0),
                seasonal = list(order = c(0, 1, 1), period = 12),
                include.drift = TRUE)
summary(mod6HC) #AIC=4185.99 AICc=4186.06 BIC=4197.55
## Series: allHC$hate_crime
## ARIMA(0,0,0)(0,1,1)[12] with drift
##
## Coefficients:
## sma1 drift
## -0.5361 0.2571
## s.e. 0.0612 0.2225
##
## sigma^2 = 9582: log likelihood = -2090
## AIC=4185.99 AICc=4186.06 BIC=4197.55
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3.478633 95.96676 55.20574 -1.895506 13.35359 1.076936 0.5564345
# Wald z-tests on the SARIMA coefficients.
coeftest(mod6HC) #drift not sig.
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## sma1 -0.536100 0.061206 -8.7589 <2e-16 ***
## drift 0.257133 0.222515 1.1556 0.2479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Refit without the insignificant drift term.
mod6.2HC <- Arima(allHC$hate_crime, order = c(0, 0, 0),
                  seasonal = list(order = c(0, 1, 1), period = 12),
                  include.drift = FALSE)
summary(mod6.2HC) # AIC=4185.42 AICc=4185.46 BIC=4193.12 -- better AIC/BIC, so drop the drift term
## Series: allHC$hate_crime
## ARIMA(0,0,0)(0,1,1)[12]
##
## Coefficients:
## sma1
## -0.5506
## s.e. 0.0621
##
## sigma^2 = 9587: log likelihood = -2090.71
## AIC=4185.42 AICc=4185.46 BIC=4193.12
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 9.528176 96.12702 54.42133 -0.2716837 13.01058 1.061634 0.5570456
# Wald z-tests; the single SMA(1) coefficient is highly significant.
coeftest(mod6.2HC)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## sma1 -0.550624 0.062084 -8.869 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check stationarity of the residuals.
tsplot(mod6.2HC$residuals)
adf.test(mod6.2HC$residuals)
##
## Augmented Dickey-Fuller Test
##
## data: mod6.2HC$residuals
## Dickey-Fuller = -3.5799, Lag order = 7, p-value = 0.03516
## alternative hypothesis: stationary
# Phillips-Perron test on the mod6.2HC residuals.
pp.test(mod6.2HC$residuals)
## Warning in pp.test(mod6.2HC$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod6.2HC$residuals
## Dickey-Fuller Z(alpha) = -171.37, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test on the mod6.2HC residuals (rejects level stationarity at 5%).
kpss.test(mod6.2HC$residuals) #0.03812
##
## KPSS Test for Level Stationarity
##
## data: mod6.2HC$residuals
## KPSS Level = 0.51574, Truncation lag parameter = 5, p-value = 0.03812
# ERS DF-GLS test on the mod6.2HC residuals.
summary(ur.ers(mod6.2HC$residuals))
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -379.06 -26.76 -1.19 24.67 847.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.29824 0.05939 -5.021 8.19e-07 ***
## yd.diff.lag1 -0.23778 0.06849 -3.472 0.000582 ***
## yd.diff.lag2 -0.20396 0.06734 -3.029 0.002638 **
## yd.diff.lag3 -0.05792 0.06209 -0.933 0.351521
## yd.diff.lag4 -0.01885 0.05448 -0.346 0.729639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.83 on 350 degrees of freedom
## Multiple R-squared: 0.2542, Adjusted R-squared: 0.2435
## F-statistic: 23.86 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -5.0215
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# FIX: `lag.max` named explicitly instead of partial-matched `lag=`.
acf(mod6.2HC$residuals, lag.max = 50)  # slow decay -> add AR terms
pacf(mod6.2HC$residuals, lag.max = 50) # AR(2); lag 24 also significant
# 2. Add AR(2) terms.
mod7HC <- Arima(allHC$hate_crime, order = c(2, 0, 0),
                seasonal = list(order = c(0, 1, 1), period = 12),
                include.drift = FALSE)
summary(mod7HC) #AIC=4034.43 AICc=4034.55 BIC=4049.84, improved
## Series: allHC$hate_crime
## ARIMA(2,0,0)(0,1,1)[12]
##
## Coefficients:
## ar1 ar2 sma1
## 0.5433 0.1385 -0.8318
## s.e. 0.0540 0.0535 0.0503
##
## sigma^2 = 5996: log likelihood = -2013.22
## AIC=4034.43 AICc=4034.55 BIC=4049.84
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.766899 75.80656 38.07757 -1.348706 9.17772 0.7428053 -0.02594358
# Wald z-tests; all three coefficients significant.
coeftest(mod7HC)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.543260 0.053978 10.064 < 2e-16 ***
## ar2 0.138463 0.053544 2.586 0.00971 **
## sma1 -0.831829 0.050307 -16.535 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual stationarity checks for mod7HC.
tsplot(mod7HC$residuals) #looks better
adf.test(mod7HC$residuals)
## Warning in adf.test(mod7HC$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod7HC$residuals
## Dickey-Fuller = -4.6994, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test on the mod7HC residuals.
pp.test(mod7HC$residuals)
## Warning in pp.test(mod7HC$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod7HC$residuals
## Dickey-Fuller Z(alpha) = -361.01, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test on the mod7HC residuals (rejects level stationarity).
kpss.test(mod7HC$residuals) #0.01
## Warning in kpss.test(mod7HC$residuals): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod7HC$residuals
## KPSS Level = 0.80086, Truncation lag parameter = 5, p-value = 0.01
# ERS DF-GLS test on the mod7HC residuals.
summary(ur.ers(mod7HC$residuals))
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -243.90 -28.73 -3.28 23.44 859.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.93912 0.12439 -7.550 3.83e-13 ***
## yd.diff.lag1 -0.08793 0.11243 -0.782 0.4347
## yd.diff.lag2 -0.20683 0.09729 -2.126 0.0342 *
## yd.diff.lag3 -0.12013 0.07640 -1.572 0.1168
## yd.diff.lag4 -0.05885 0.05345 -1.101 0.2717
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.77 on 350 degrees of freedom
## Multiple R-squared: 0.5263, Adjusted R-squared: 0.5196
## F-statistic: 77.78 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -7.5495
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# FIX: `lag.max` named explicitly instead of partial-matched `lag=`.
acf(mod7HC$residuals, lag.max = 50)  # several mildly significant lags
pacf(mod7HC$residuals, lag.max = 50) # lag 2 significant
# 3. Compare against the automatic order search.
mod8HC <- auto.arima(allHC$hate_crime)
summary(mod8HC) #suggests ARIMA(1,1,1), AIC=4192.2 AICc=4192.27 BIC=4203.85, not better
## Series: allHC$hate_crime
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.5375 -0.9395
## s.e. 0.0545 0.0221
##
## sigma^2 = 6805: log likelihood = -2093.1
## AIC=4192.2 AICc=4192.27 BIC=4203.85
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 4.016691 82.14928 46.08507 -1.519288 11.67812 0.8990131 0.01779191
# Residual stationarity checks for the auto.arima model.
tsplot(mod8HC$residuals) #looks better
adf.test(mod8HC$residuals)
## Warning in adf.test(mod8HC$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod8HC$residuals
## Dickey-Fuller = -7.2941, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test on the mod8HC residuals.
pp.test(mod8HC$residuals)
## Warning in pp.test(mod8HC$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod8HC$residuals
## Dickey-Fuller Z(alpha) = -340.79, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test on the mod8HC residuals.
kpss.test(mod8HC$residuals)
## Warning in kpss.test(mod8HC$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod8HC$residuals
## KPSS Level = 0.27729, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS test on the mod8HC residuals; all stationarity tests pass.
summary(ur.ers(mod8HC$residuals)) #passed all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -311.13 -31.12 4.98 31.42 880.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -1.08007 0.11872 -9.098 <2e-16 ***
## yd.diff.lag1 0.10104 0.10600 0.953 0.341
## yd.diff.lag2 0.08494 0.09186 0.925 0.356
## yd.diff.lag3 0.08215 0.07494 1.096 0.274
## yd.diff.lag4 0.07490 0.05354 1.399 0.163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83.05 on 350 degrees of freedom
## Multiple R-squared: 0.4917, Adjusted R-squared: 0.4844
## F-statistic: 67.71 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -9.0977
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# FIX: `lag.max` named explicitly instead of partial-matched `lag=`.
acf(mod8HC$residuals, lag.max = 100) # residual seasonality -> needs seasonal differencing
pacf(mod8HC$residuals, lag.max = 50) # lag 12 significant then cuts off -> SAR(1)
# 4. Add terms to mod7HC to address the remaining ACF/PACF structure.
# Add an MA(1) term.
mod9HC <- Arima(allHC$hate_crime, order = c(2, 0, 1),
                seasonal = list(order = c(0, 1, 1), period = 12),
                include.drift = FALSE)
summary(mod9HC) #AIC=4017.1 AICc=4017.27 BIC=4036.36, improved
## Series: allHC$hate_crime
## ARIMA(2,0,1)(0,1,1)[12]
##
## Coefficients:
## ar1 ar2 ma1 sma1
## 1.3100 -0.3239 -0.8332 -0.8858
## s.e. 0.0924 0.0832 0.0658 0.0443
##
## sigma^2 = 5642: log likelihood = -2003.55
## AIC=4017.1 AICc=4017.27 BIC=4036.36
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 3.504163 73.42226 35.84429 -0.5534322 8.520611 0.6992392
## ACF1
## Training set 0.008615474
# Wald z-tests; all four coefficients significant.
coeftest(mod9HC) #all sig.
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 1.309959 0.092409 14.1757 < 2.2e-16 ***
## ar2 -0.323904 0.083188 -3.8936 9.875e-05 ***
## ma1 -0.833242 0.065763 -12.6704 < 2.2e-16 ***
## sma1 -0.885843 0.044287 -20.0022 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual stationarity checks for mod9HC.
tsplot(mod9HC$residuals) #looks okay
adf.test(mod9HC$residuals)
## Warning in adf.test(mod9HC$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod9HC$residuals
## Dickey-Fuller = -6.0964, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test on the mod9HC residuals.
pp.test(mod9HC$residuals)
## Warning in pp.test(mod9HC$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod9HC$residuals
## Dickey-Fuller Z(alpha) = -348.9, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test on the mod9HC residuals.
kpss.test(mod9HC$residuals)
## Warning in kpss.test(mod9HC$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod9HC$residuals
## KPSS Level = 0.19751, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS test on the mod9HC residuals; all stationarity tests pass.
summary(ur.ers(mod9HC$residuals)) #passed all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.56 -23.99 -0.16 20.68 854.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -1.00651 0.11986 -8.397 1.15e-15 ***
## yd.diff.lag1 0.02163 0.10765 0.201 0.841
## yd.diff.lag2 -0.03271 0.09368 -0.349 0.727
## yd.diff.lag3 0.01588 0.07529 0.211 0.833
## yd.diff.lag4 0.02188 0.05364 0.408 0.684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.27 on 350 degrees of freedom
## Multiple R-squared: 0.4963, Adjusted R-squared: 0.4891
## F-statistic: 68.97 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -8.3974
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# FIX: `lag.max` named explicitly instead of partial-matched `lag=`.
acf(mod9HC$residuals, lag.max = 50)  # looks good
pacf(mod9HC$residuals, lag.max = 50) # looks good
# 5. ACF/PACF issues fixed; now add extra terms to confirm mod9HC is best.
# Add a SAR(1) term.
mod10HC <- Arima(allHC$hate_crime, order = c(2, 0, 1),
                 seasonal = list(order = c(1, 1, 1), period = 12),
                 include.drift = FALSE)
summary(mod10HC) #AIC=4018.41 AICc=4018.65 BIC=4041.52, got worse
## Series: allHC$hate_crime
## ARIMA(2,0,1)(1,1,1)[12]
##
## Coefficients:
## ar1 ar2 ma1 sar1 sma1
## 1.3095 -0.3248 -0.8314 0.0598 -0.9035
## s.e. 0.0970 0.0861 0.0706 0.0722 0.0476
##
## sigma^2 = 5636: log likelihood = -2003.2
## AIC=4018.41 AICc=4018.65 BIC=4041.52
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3.525461 73.2823 35.7457 -0.5631361 8.50513 0.6973159 0.007914683
# Wald z-tests; the added SAR(1) term is not significant.
coeftest(mod10HC) #sar1 not sig
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 1.309549 0.097003 13.5001 < 2.2e-16 ***
## ar2 -0.324837 0.086072 -3.7740 0.0001606 ***
## ma1 -0.831413 0.070611 -11.7746 < 2.2e-16 ***
## sar1 0.059780 0.072215 0.8278 0.4077815
## sma1 -0.903524 0.047553 -19.0004 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual stationarity checks for mod10HC.
tsplot(mod10HC$residuals)
adf.test(mod10HC$residuals)
## Warning in adf.test(mod10HC$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod10HC$residuals
## Dickey-Fuller = -6.0792, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test on the mod10HC residuals.
pp.test(mod10HC$residuals)
## Warning in pp.test(mod10HC$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod10HC$residuals
## Dickey-Fuller Z(alpha) = -349.75, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test on the mod10HC residuals.
kpss.test(mod10HC$residuals)
## Warning in kpss.test(mod10HC$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod10HC$residuals
## KPSS Level = 0.19902, Truncation lag parameter = 5, p-value = 0.1
# ERS DF-GLS test on the mod10HC residuals; all stationarity tests pass.
summary(ur.ers(mod10HC$residuals)) #passed all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -224.21 -23.35 0.86 21.22 850.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -1.00190 0.11980 -8.363 1.47e-15 ***
## yd.diff.lag1 0.01610 0.10763 0.150 0.881
## yd.diff.lag2 -0.03669 0.09366 -0.392 0.695
## yd.diff.lag3 0.01159 0.07531 0.154 0.878
## yd.diff.lag4 0.01876 0.05363 0.350 0.727
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.14 on 350 degrees of freedom
## Multiple R-squared: 0.4965, Adjusted R-squared: 0.4893
## F-statistic: 69.03 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -8.3629
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# FIX: `lag.max` named explicitly instead of partial-matched `lag=`.
acf(mod10HC$residuals, lag.max = 50)  # looks good
pacf(mod10HC$residuals, lag.max = 50) # looks good
# Add a regular (non-seasonal) difference.
mod11HC <- Arima(allHC$hate_crime, order = c(2, 1, 1),
                 seasonal = list(order = c(1, 1, 1), period = 12),
                 include.drift = FALSE)
summary(mod11HC) #AIC=4010.96 AICc=4011.21 BIC=4034.06, got better
## Series: allHC$hate_crime
## ARIMA(2,1,1)(1,1,1)[12]
##
## Coefficients:
## ar1 ar2 ma1 sar1 sma1
## 0.3276 -0.0513 -0.8304 0.0462 -0.8973
## s.e. 0.0832 0.0707 0.0658 0.0715 0.0477
##
## sigma^2 = 5680: log likelihood = -1999.48
## AIC=4010.96 AICc=4011.21 BIC=4034.06
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -3.764594 73.457 35.93096 -2.124327 8.600325 0.7009299
## ACF1
## Training set -9.150525e-05
# Wald z-tests; ar2 and sar1 are not significant.
coeftest(mod11HC) #two terms not sig. ar2, sar1
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.327572 0.083181 3.9380 8.215e-05 ***
## ar2 -0.051287 0.070706 -0.7254 0.4682
## ma1 -0.830389 0.065779 -12.6239 < 2.2e-16 ***
## sar1 0.046235 0.071496 0.6467 0.5178
## sma1 -0.897290 0.047750 -18.7914 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual stationarity checks for mod11HC.
tsplot(mod11HC$residuals)
adf.test(mod11HC$residuals)
## Warning in adf.test(mod11HC$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod11HC$residuals
## Dickey-Fuller = -6.6266, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test on the mod11HC residuals.
pp.test(mod11HC$residuals)
## Warning in pp.test(mod11HC$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod11HC$residuals
## Dickey-Fuller Z(alpha) = -356.48, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test on the mod11HC residuals (rejects level stationarity at 5%).
kpss.test(mod11HC$residuals) #0.02322
##
## KPSS Test for Level Stationarity
##
## data: mod11HC$residuals
## KPSS Level = 0.59353, Truncation lag parameter = 5, p-value = 0.02322
# ERS DF-GLS test on the mod11HC residuals.
summary(ur.ers(mod11HC$residuals))
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -238.14 -28.44 -2.77 17.99 843.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.98769 0.11889 -8.308 2.17e-15 ***
## yd.diff.lag1 -0.01088 0.10704 -0.102 0.919
## yd.diff.lag2 -0.02948 0.09365 -0.315 0.753
## yd.diff.lag3 0.02252 0.07598 0.296 0.767
## yd.diff.lag4 0.02249 0.05373 0.419 0.676
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.29 on 350 degrees of freedom
## Multiple R-squared: 0.5005, Adjusted R-squared: 0.4934
## F-statistic: 70.15 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -8.3076
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# FIX: `lag.max` named explicitly instead of partial-matched `lag=`.
acf(mod11HC$residuals, lag.max = 50)  # looks good
pacf(mod11HC$residuals, lag.max = 50) # looks good; mod11HC's AIC/BIC improved, but vs mod9HC it failed one stationarity test and has two insignificant terms, so keep mod9HC
# Comparing the 7 SARIMA candidates, the best model is:
mod9HC <- Arima(allHC$hate_crime, order = c(2, 0, 1),
                seasonal = list(order = c(0, 1, 1), period = 12),
                include.drift = FALSE)
summary(mod9HC) #AIC=4017.1 AICc=4017.27 BIC=4036.36
## Series: allHC$hate_crime
## ARIMA(2,0,1)(0,1,1)[12]
##
## Coefficients:
## ar1 ar2 ma1 sma1
## 1.3100 -0.3239 -0.8332 -0.8858
## s.e. 0.0924 0.0832 0.0658 0.0443
##
## sigma^2 = 5642: log likelihood = -2003.55
## AIC=4017.1 AICc=4017.27 BIC=4036.36
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 3.504163 73.42226 35.84429 -0.5534322 8.520611 0.6992392
## ACF1
## Training set 0.008615474
# Wald z-tests for the final model's coefficients.
coeftest(mod9HC)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 1.309959 0.092409 14.1757 < 2.2e-16 ***
## ar2 -0.323904 0.083188 -3.8936 9.875e-05 ***
## ma1 -0.833242 0.065763 -12.6704 < 2.2e-16 ***
## sma1 -0.885843 0.044287 -20.0022 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# In-sample fit: squared correlation between fitted and observed values.
cor(fitted(mod9HC), allHC$hate_crime)^2 #0.5787597
## [1] 0.5787597
# Check residual normality (forecast intervals assume Gaussian errors).
qqPlot(mod9HC$residuals) #not normal, outliers
## [1] 129 354
densityplot(as.numeric(mod9HC$residuals)) #some outliers
shapiro.test(mod9HC$residuals) #not normal, we can't trust forecast interval
##
## Shapiro-Wilk normality test
##
## data: mod9HC$residuals
## W = 0.56305, p-value < 2.2e-16
For all racial hate crimes, the best model is SARIMA(2,0,1)×(0,1,1)[12]; however, the residuals fail the Shapiro–Wilk normality test, so forecast intervals should be interpreted with caution.
# Compare the best function-of-time model against the best SARIMA model for
# all-race hate crimes by holding out the final 12 months (year 2020) as a test
# set, refitting both models on months 1-348, and scoring the squared
# correlation between the 12 forecasts and the held-out observations.
#best function of time model: seasonal model
t <- 1:360  #time index reused as a regressor throughout this script
mod5HC <- lm(hate_crime~t+as.factor(month)+0, data=allHC)
#summary(mod5HC) #Adjusted R-squared: 0.2464 #better than all above
#AIC(mod5HC) #4336.28
#BIC(mod5HC) #4390.685
#best SARIMA model
mod9HC <- Arima(allHC$hate_crime, order=c(2,0,1), seasonal=list(order=c(0,1,1), period=12), include.drift = FALSE)
#summary(mod9HC) #AIC=4017.1 AICc=4017.27 BIC=4036.36
#compare forecasting performance of two models
#str(allHC)
a <- allHC[1:348, ] #training set: months 1-348, i.e. Jan 1991 through Dec 2019
t <- 1:348
foftimeMod <- lm(hate_crime~t+as.factor(month)+0, data=a)
AIC(foftimeMod) #4066.171
## [1] 4066.171
BIC(foftimeMod) #4120.102
## [1] 4120.102
#forecast the 12 held-out months (Jan-Dec 2020) from each model
foftimePred <- predict(foftimeMod, data.frame(t=349:360, month=1:12))
sarimaMod <- Arima(a$hate_crime, order=c(2,0,1), seasonal=list(order=c(0,1,1), period=12), include.drift = FALSE)
#summary(sarimaMod) #AIC=3780.98 AICc=3781.16 BIC=3800.06
sarimaPred <- forecast(sarimaMod, h=12)$mean
#real data: the held-out final 12 months
real_value <- allHC$hate_crime[349:360]
#compare with real data
cor(real_value, foftimePred)^2 # R^2 = 0.2608992
## [1] 0.2608992
cor(real_value, sarimaPred)^2 # R^2 = 0.3221583, this has the best prediction
## [1] 0.3221583
The SARIMA model is better: its out-of-sample R² (0.322) exceeds that of the function-of-time model (0.261).
#plot prediction from the best SARIMA model for all-race hate crimes
pred_allHC=forecast(mod9HC,h=24) #two years ahead: 2021, 2022
plot(pred_allHC, main="Prediction on all racial hate crimes", xlab="Time", ylab="Hate crimes")
# Cross-check point forecasts with sarima.for (astsa) -- they should closely match forecast() above
forecasting2<-sarima.for(allHC$hate_crime, 2,0,1, 0,1,1, 12, n.ahead = 24)
forecasting2$pred
## Time Series:
## Start = 361
## End = 384
## Frequency = 1
## [1] 420.6226 441.0736 499.7341 503.8945 551.0491 599.8965 588.0991 569.8787
## [9] 560.5525 550.7625 489.0558 430.3178 444.3148 449.4107 502.9406 505.4231
## [17] 552.0655 600.7935 589.0087 570.8446 561.5887 551.8730 490.2411 431.5774
pred_allHC #print point forecasts with 80% and 95% intervals
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 361 416.4140 320.1489 512.6791 269.1892 563.6388
## 362 434.0076 327.3642 540.6510 270.9106 597.1046
## 363 490.6057 380.1072 601.1042 321.6128 659.5986
## 364 493.0561 380.1816 605.9305 320.4295 665.6826
## 365 538.5774 423.8014 653.3534 363.0427 714.1122
## 366 585.6799 469.2097 702.1501 407.5541 763.8057
## 367 572.5839 454.5436 690.6243 392.0569 753.1110
## 368 553.0345 433.5182 672.5509 370.2501 735.8190
## 369 542.4946 421.5833 663.4060 357.5767 727.4126
## 370 531.3875 409.1547 653.6203 344.4485 718.3264
## 371 468.3682 344.8821 591.8543 279.5125 657.2239
## 372 408.3875 283.7119 533.0631 217.7126 599.0624
## 373 420.7937 293.0511 548.5363 225.4282 616.1591
## 374 424.3957 294.8201 553.9714 226.2269 622.5645
## 375 476.5960 345.5707 607.6212 276.2101 676.9818
## 376 477.8172 345.5026 610.1318 275.4595 680.1749
## 377 523.1530 389.6408 656.6651 318.9638 727.3422
## 378 570.4104 435.7687 705.0521 364.4937 776.3271
## 379 557.5776 421.8645 693.2906 350.0224 765.1328
## 380 538.3227 401.5912 675.0541 329.2100 747.4353
## 381 528.0833 390.3829 665.7836 317.4887 738.6778
## 382 517.2744 378.6515 655.8973 305.2690 729.2798
## 383 454.5486 315.0470 594.0501 241.1994 667.8978
## 384 394.8556 254.5170 535.1942 180.2263 609.4850
# Exploratory analysis of the anti-Black hate-crime series.
summary(blackHC)
## year month hate_crime difference
## Min. :1991 Min. : 1.00 Min. : 88.0 Min. :-206.0000
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:167.0 1st Qu.: -22.0000
## Median :2006 Median : 6.50 Median :202.0 Median : 2.0000
## Mean :2006 Mean : 6.50 Mean :207.7 Mean : 0.2444
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:242.5 3rd Qu.: 21.0000
## Max. :2020 Max. :12.00 Max. :693.0 Max. : 456.0000
# Build monthly xts series; ts3 (POSIXct index) is not used below -- NOTE(review): candidate for removal
ts3 <- xts(blackHC$hate_crime, as.POSIXct(sprintf("%d-%d-01", blackHC$year, blackHC$month)))
ts4 <- xts(blackHC$hate_crime, as.yearmon(blackHC$year + (blackHC$month-1)/12))
plot(ts4, main="Anti-African American or Black Hate Crimes")
#par(mfrow=1:2)
#for (i in 1:30){
# start=12*(i-1)+1
# end=12*i
# print(plot(ts4[start:end]))
#}
#start with a linear trend model: hate_crime ~ time index
t=1:360
mod1B=lm(hate_crime~t, data=blackHC)
summary(mod1B) #Adjusted R-squared: 0.07153
##
## Call:
## lm(formula = hate_crime ~ t, data = blackHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -148.47 -35.69 -3.80 27.88 513.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 236.63067 6.24207 37.909 < 2e-16 ***
## t -0.16043 0.02997 -5.353 1.55e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.09 on 358 degrees of freedom
## Multiple R-squared: 0.07411, Adjusted R-squared: 0.07153
## F-statistic: 28.66 on 1 and 358 DF, p-value: 1.547e-07
#checking stationarity of the linear-trend residuals
plot(mod1B$residuals) #doesn't look stationary, fit quadratic trend
lines(lowess(mod1B$residuals)) #lowess smoother highlights the remaining trend
#quadratic trend model
mod2B=lm(hate_crime~t+I(t^2), data=blackHC)
summary(mod2B) #Adjusted R-squared: 0.08305
##
## Call:
## lm(formula = hate_crime ~ t + I(t^2), data = blackHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132.38 -35.48 -4.16 27.64 527.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.203e+02 9.337e+00 23.590 <2e-16 ***
## t 1.108e-01 1.194e-01 0.928 0.3541
## I(t^2) -7.514e-04 3.204e-04 -2.345 0.0196 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.73 on 357 degrees of freedom
## Multiple R-squared: 0.08816, Adjusted R-squared: 0.08305
## F-statistic: 17.26 on 2 and 357 DF, p-value: 7.005e-08
#tsplot(mod2B$residuals) #didn't change much from the original tsplot
plot(mod2B$residuals) #doesn't look stationary; visible seasonality
lines(lowess(mod2B$residuals))
#cosine model: add annual (period-12) harmonic terms to capture seasonality
blackHC$Xcos=cos(2*pi*t/12)
blackHC$Xsin=sin(2*pi*t/12)
mod3B=lm(hate_crime~t+I(t^2)+Xcos+Xsin,data=blackHC)
summary(mod3B) #Adjusted R-squared: 0.228 , improved. t and t^2 not sig.
##
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos + Xsin, data = blackHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -135.72 -31.35 -4.31 24.78 498.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.206e+02 8.569e+00 25.740 < 2e-16 ***
## t 1.086e-01 1.096e-01 0.991 0.322434
## I(t^2) -7.489e-04 2.940e-04 -2.547 0.011283 *
## Xcos -2.954e+01 4.016e+00 -7.355 1.33e-12 ***
## Xsin -1.551e+01 4.018e+00 -3.861 0.000134 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.89 on 355 degrees of freedom
## Multiple R-squared: 0.2366, Adjusted R-squared: 0.228
## F-statistic: 27.5 on 4 and 355 DF, p-value: < 2.2e-16
tsplot(mod3B$residuals) #looks like it got rid of the seasonal trend, but still not stationary
plot(mod3B$residuals)
lines(lowess(mod3B$residuals)) #better than the two models above
#cosine model, dropping the (non-significant) trend terms
mod4B=lm(hate_crime~Xcos+Xsin,data=blackHC)
summary(mod4B) #Adjusted R-squared: 0.1426, not better than above
##
## Call:
## lm(formula = hate_crime ~ Xcos + Xsin, data = blackHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -122.39 -37.79 -4.44 28.84 455.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.672 2.993 69.386 < 2e-16 ***
## Xcos -29.715 4.233 -7.020 1.12e-11 ***
## Xsin -14.912 4.233 -3.523 0.000482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.79 on 357 degrees of freedom
## Multiple R-squared: 0.1474, Adjusted R-squared: 0.1426
## F-statistic: 30.85 on 2 and 357 DF, p-value: 4.387e-13
tsplot(mod4B$residuals) #looks like it got rid of the seasonal trend, but still not stationary
plot(mod4B$residuals)
lines(lowess(mod4B$residuals)) #better than the first two models above
#seasonal model: linear trend plus a dummy for each month (no intercept)
mod5B=lm(hate_crime~t+as.factor(month)+0, data=blackHC)
summary(mod5B) #Adjusted R-squared: 0.9388 #better than all above (no-intercept R^2 is inflated, though)
##
## Call:
## lm(formula = hate_crime ~ t + as.factor(month) + 0, data = blackHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.46 -30.01 -1.42 23.95 490.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## t -0.16089 0.02718 -5.919 7.76e-09 ***
## as.factor(month)1 203.22164 10.87447 18.688 < 2e-16 ***
## as.factor(month)2 205.68252 10.88638 18.894 < 2e-16 ***
## as.factor(month)3 239.81008 10.89835 22.004 < 2e-16 ***
## as.factor(month)4 241.03763 10.91038 22.093 < 2e-16 ***
## as.factor(month)5 252.39851 10.92246 23.108 < 2e-16 ***
## as.factor(month)6 259.42607 10.93459 23.725 < 2e-16 ***
## as.factor(month)7 261.62028 10.94678 23.899 < 2e-16 ***
## as.factor(month)8 263.08117 10.95902 24.006 < 2e-16 ***
## as.factor(month)9 254.14206 10.97131 23.164 < 2e-16 ***
## as.factor(month)10 261.03627 10.98366 23.766 < 2e-16 ***
## as.factor(month)11 217.43049 10.99606 19.773 < 2e-16 ***
## as.factor(month)12 181.65805 11.00852 16.502 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.56 on 347 degrees of freedom
## Multiple R-squared: 0.941, Adjusted R-squared: 0.9388
## F-statistic: 425.8 on 13 and 347 DF, p-value: < 2.2e-16
# Residual diagnostics and information criteria for the seasonal-dummy model.
tsplot(mod5B$residuals) #looks like it got rid of the seasonal trend, but still not stationary
plot(mod5B$residuals)
lines(lowess(mod5B$residuals)) #better than the first two models above
AIC(mod5B)
## [1] 3902.605
BIC(mod5B)
## [1] 3957.01
#none of the residual plots looks linear/stationary -- try SARIMA instead
None of the models fixed the issue of non-linearity observed in the residuals, so we cannot rely on function-of-time models for prediction; we turn to SARIMA models instead.
# Stationarity tests on the raw anti-Black series: ADF and PP reject a unit
# root, but KPSS rejects level stationarity and DF-GLS is only marginal, so
# differencing is explored next.
tsplot(blackHC$hate_crime) #doesn't look stationary
adf.test(blackHC$hate_crime) #rejects unit root (p = 0.017)
##
## Augmented Dickey-Fuller Test
##
## data: blackHC$hate_crime
## Dickey-Fuller = -3.8531, Lag order = 7, p-value = 0.01662
## alternative hypothesis: stationary
pp.test(blackHC$hate_crime) #also rejects unit root
## Warning in pp.test(blackHC$hate_crime): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: blackHC$hate_crime
## Dickey-Fuller Z(alpha) = -94.405, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(blackHC$hate_crime) #fail: rejects level stationarity (p < 0.01)
## Warning in kpss.test(blackHC$hate_crime): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: blackHC$hate_crime
## KPSS Level = 1.2825, Truncation lag parameter = 5, p-value = 0.01
summary(ur.ers(blackHC$hate_crime)) #DF-GLS = -1.97: rejects at 5% but not at 1%
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.84 -17.82 4.91 25.28 472.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.03800 0.01933 -1.965 0.05017 .
## yd.diff.lag1 -0.15915 0.05508 -2.889 0.00410 **
## yd.diff.lag2 -0.17240 0.05527 -3.119 0.00196 **
## yd.diff.lag3 -0.13262 0.05471 -2.424 0.01585 *
## yd.diff.lag4 -0.07799 0.05443 -1.433 0.15277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41.85 on 350 degrees of freedom
## Multiple R-squared: 0.07575, Adjusted R-squared: 0.06255
## F-statistic: 5.737 on 5 and 350 DF, p-value: 4.187e-05
##
##
## Value of test-statistic is: -1.9653
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# Seasonal differencing (lag 12): the differenced series passes all four stationarity tests.
acf(blackHC$hate_crime, lag=100) #doesn't look stationary, might need seasonal differencing
tsplot(diff(blackHC$hate_crime, lag=12)) #looks better but some concern with variance
adf.test(diff(blackHC$hate_crime, lag=12))
##
## Augmented Dickey-Fuller Test
##
## data: diff(blackHC$hate_crime, lag = 12)
## Dickey-Fuller = -3.9586, Lag order = 7, p-value = 0.01137
## alternative hypothesis: stationary
pp.test(diff(blackHC$hate_crime, lag=12))
## Warning in pp.test(diff(blackHC$hate_crime, lag = 12)): p-value smaller than
## printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: diff(blackHC$hate_crime, lag = 12)
## Dickey-Fuller Z(alpha) = -129.26, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(blackHC$hate_crime, lag=12)) #pass: does not reject stationarity
## Warning in kpss.test(diff(blackHC$hate_crime, lag = 12)): p-value greater than
## printed p-value
##
## KPSS Test for Level Stationarity
##
## data: diff(blackHC$hate_crime, lag = 12)
## KPSS Level = 0.28315, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(diff(blackHC$hate_crime, lag=12))) #pass all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -126.53 -26.14 -4.49 14.71 479.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.045894 0.022544 -2.036 0.04255 *
## yd.diff.lag1 -0.305953 0.056922 -5.375 1.43e-07 ***
## yd.diff.lag2 -0.292114 0.058250 -5.015 8.59e-07 ***
## yd.diff.lag3 -0.182152 0.057354 -3.176 0.00163 **
## yd.diff.lag4 -0.006499 0.055326 -0.117 0.90656
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.99 on 338 degrees of freedom
## Multiple R-squared: 0.1564, Adjusted R-squared: 0.1439
## F-statistic: 12.53 on 5 and 338 DF, p-value: 3.562e-11
##
##
## Value of test-statistic is: -2.0358
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# Add a regular (lag-1) difference on top of the seasonal one; the doubly
# differenced series passes all four stationarity tests.
acf(diff(blackHC$hate_crime, lag=12), lag=100) #doesn't look stationary, might need regular differencing?
tsplot(diff(diff(blackHC$hate_crime, lag=12)))
adf.test(diff(diff(blackHC$hate_crime, lag=12)))
## Warning in adf.test(diff(diff(blackHC$hate_crime, lag = 12))): p-value smaller
## than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(diff(blackHC$hate_crime, lag = 12))
## Dickey-Fuller = -9.2152, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(diff(blackHC$hate_crime, lag=12)))
## Warning in pp.test(diff(diff(blackHC$hate_crime, lag = 12))): p-value smaller
## than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: diff(diff(blackHC$hate_crime, lag = 12))
## Dickey-Fuller Z(alpha) = -342.11, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(diff(blackHC$hate_crime, lag=12)))
## Warning in kpss.test(diff(diff(blackHC$hate_crime, lag = 12))): p-value greater
## than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: diff(diff(blackHC$hate_crime, lag = 12))
## KPSS Level = 0.025647, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(diff(diff(blackHC$hate_crime, lag=12)))) #pass all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.48 -9.84 11.11 32.03 491.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -1.11638 0.14955 -7.465 7.17e-13 ***
## yd.diff.lag1 -0.07589 0.13434 -0.565 0.57250
## yd.diff.lag2 -0.21318 0.11169 -1.909 0.05715 .
## yd.diff.lag3 -0.22730 0.08527 -2.666 0.00805 **
## yd.diff.lag4 -0.07329 0.05512 -1.330 0.18455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.16 on 337 degrees of freedom
## Multiple R-squared: 0.5991, Adjusted R-squared: 0.5931
## F-statistic: 100.7 on 5 and 337 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -7.4651
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# Model identification from ACF/PACF of the doubly differenced series, then the
# first SARIMA candidate with d=1, D=1.
acf(diff(diff(blackHC$hate_crime, lag=12))) #no huge seasonal pattern; keep both differences
tsplot(log(blackHC$hate_crime)) #log transform doesn't fix the non-constant variance
#black hate crime
tsplot(diff(diff(blackHC$hate_crime, lag=12)))
acf(diff(diff(blackHC$hate_crime, lag=12)), lag=50) #spikes at lags 1,2,12: suggests MA(2) and SMA(1)
pacf(diff(diff(blackHC$hate_crime, lag=12)), lag=50) #cut off at 12, SAR(1)
#1. start with SMA(1) included
mod6B=Arima(blackHC$hate_crime,order=c(0,1,0), seasonal=list(order=c(0,1,1),period=12), include.drift = TRUE) #has warning: can't include drift with 2 orders of differencing
## Warning in Arima(blackHC$hate_crime, order = c(0, 1, 0), seasonal = list(order =
## c(0, : No drift term fitted as the order of difference is 2 or more.
mod6.2B=Arima(blackHC$hate_crime,order=c(0,1,0), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod6.2B) #AIC=3535.3 AICc=3535.34 BIC=3543
## Series: blackHC$hate_crime
## ARIMA(0,1,0)(0,1,1)[12]
##
## Coefficients:
## sma1
## -0.7664
## s.e. 0.0574
##
## sigma^2 = 1497: log likelihood = -1765.65
## AIC=3535.3 AICc=3535.34 BIC=3543
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.6671603 37.92833 22.38026 -1.091368 10.36432 0.7878519
## ACF1
## Training set -0.2021082
# Residual stationarity checks for mod6.2B: all four tests pass.
tsplot(mod6.2B$residuals) #fairly okay
adf.test(mod6.2B$residuals) #stationary
## Warning in adf.test(mod6.2B$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod6.2B$residuals
## Dickey-Fuller = -9.6037, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod6.2B$residuals) #stationary
## Warning in pp.test(mod6.2B$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod6.2B$residuals
## Dickey-Fuller Z(alpha) = -341.02, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod6.2B$residuals) #stationary
## Warning in kpss.test(mod6.2B$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod6.2B$residuals
## KPSS Level = 0.098712, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod6.2B$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -123.35 -14.85 2.22 15.69 466.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -2.14489 0.18504 -11.591 < 2e-16 ***
## yd.diff.lag1 0.82165 0.16083 5.109 5.34e-07 ***
## yd.diff.lag2 0.46843 0.12803 3.659 0.000292 ***
## yd.diff.lag3 0.22233 0.09161 2.427 0.015727 *
## yd.diff.lag4 0.13157 0.05497 2.394 0.017213 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.44 on 350 degrees of freedom
## Multiple R-squared: 0.6458, Adjusted R-squared: 0.6408
## F-statistic: 127.6 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -11.5913
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# ACF/PACF of mod6.2B residuals suggest adding non-seasonal MA terms.
acf(mod6.2B$residuals, lag=100) #MA(2) most obvious
pacf(mod6.2B$residuals, lag=100) #lags 1,2,3 significant
#2. add MA(2)
mod7B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod7B) #AIC=3478.41 AICc=3478.53 BIC=3493.81
## Series: blackHC$hate_crime
## ARIMA(0,1,2)(0,1,1)[12]
##
## Coefficients:
## ma1 ma2 sma1
## -0.3883 -0.2895 -0.7777
## s.e. 0.0498 0.0497 0.0571
##
## sigma^2 = 1259: log likelihood = -1735.2
## AIC=3478.41 AICc=3478.53 BIC=3493.81
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -1.088744 34.6865 19.38158 -1.575956 9.117224 0.6822895 0.02488984
# Residual stationarity checks for mod7B: ADF/PP/DF-GLS pass, KPSS fails (p = 0.042).
tsplot(mod7B$residuals) #fairly okay
adf.test(mod7B$residuals) #stationary
## Warning in adf.test(mod7B$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod7B$residuals
## Dickey-Fuller = -6.8834, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod7B$residuals) #stationary
## Warning in pp.test(mod7B$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod7B$residuals
## Dickey-Fuller Z(alpha) = -344.37, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod7B$residuals) #p = 0.042, failed at the 5% level
##
## KPSS Test for Level Stationarity
##
## data: mod7B$residuals
## KPSS Level = 0.49854, Truncation lag parameter = 5, p-value = 0.042
summary(ur.ers(mod7B$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.98 -15.68 1.07 14.59 467.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -1.01741 0.11879 -8.565 3.48e-16 ***
## yd.diff.lag1 0.05184 0.10699 0.485 0.628
## yd.diff.lag2 0.08170 0.09176 0.890 0.374
## yd.diff.lag3 0.02724 0.07583 0.359 0.720
## yd.diff.lag4 0.07757 0.05431 1.428 0.154
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.94 on 350 degrees of freedom
## Multiple R-squared: 0.4895, Adjusted R-squared: 0.4822
## F-statistic: 67.12 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -8.5651
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# Try extending mod7B with an additional seasonal AR term.
acf(mod7B$residuals, lag=50) #looks okay
pacf(mod7B$residuals, lag=50) #looks okay
#3. add additional terms to see if our current model is the best.
#add SAR(1)
mod8B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(1,1,1),period=12), include.drift =FALSE)
summary(mod8B) #AIC=3480.22 AICc=3480.4 BIC=3499.47 #got worse
## Series: blackHC$hate_crime
## ARIMA(0,1,2)(1,1,1)[12]
##
## Coefficients:
## ma1 ma2 sar1 sma1
## -0.3887 -0.2889 -0.0429 -0.7609
## s.e. 0.0499 0.0497 0.0990 0.0713
##
## sigma^2 = 1262: log likelihood = -1735.11
## AIC=3480.22 AICc=3480.4 BIC=3499.47
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -1.050696 34.67547 19.3509 -1.557614 9.10181 0.6812094 0.02457845
#add a non-seasonal AR(1) term to mod7B
mod9B=Arima(blackHC$hate_crime,order=c(1,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod9B) #AIC=3478.09 AICc=3478.27 BIC=3497.34 #AIC slightly improved, BIC got much worse
## Series: blackHC$hate_crime
## ARIMA(1,1,2)(0,1,1)[12]
##
## Coefficients:
## ar1 ma1 ma2 sma1
## 0.2614 -0.6237 -0.1694 -0.7739
## s.e. 0.1692 0.1669 0.1045 0.0580
##
## sigma^2 = 1254: log likelihood = -1734.05
## AIC=3478.09 AICc=3478.27 BIC=3497.34
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.131123 34.57228 19.40483 -1.590025 9.119137 0.6831079
## ACF1
## Training set 0.0006896171
# Residual stationarity checks for mod9B: ADF/PP/DF-GLS pass, KPSS fails (p = 0.022).
tsplot(mod9B$residuals) #fairly okay
adf.test(mod9B$residuals) #stationary
## Warning in adf.test(mod9B$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod9B$residuals
## Dickey-Fuller = -6.5631, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod9B$residuals) #stationary
## Warning in pp.test(mod9B$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod9B$residuals
## Dickey-Fuller Z(alpha) = -356.51, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod9B$residuals) #p = 0.02246, failed at the 5% level
##
## KPSS Test for Level Stationarity
##
## data: mod9B$residuals
## KPSS Level = 0.60193, Truncation lag parameter = 5, p-value = 0.02246
summary(ur.ers(mod9B$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.32 -15.66 0.82 14.12 467.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.97879 0.11986 -8.166 5.85e-15 ***
## yd.diff.lag1 -0.01544 0.10871 -0.142 0.887
## yd.diff.lag2 -0.01977 0.09379 -0.211 0.833
## yd.diff.lag3 -0.02640 0.07638 -0.346 0.730
## yd.diff.lag4 0.05050 0.05403 0.935 0.351
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.9 on 350 degrees of freedom
## Multiple R-squared: 0.5005, Adjusted R-squared: 0.4934
## F-statistic: 70.14 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -8.1661
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod9B$residuals, lag=100) #looks okay
pacf(mod9B$residuals, lag=100) #looks okay
#add a second seasonal MA term (extends SMA(1) to SMA(2))
mod10B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,2),period=12), include.drift =FALSE)
summary(mod10B) #AIC=3480.16 AICc=3480.33 BIC=3499.4, got worse
## Series: blackHC$hate_crime
## ARIMA(0,1,2)(0,1,2)[12]
##
## Coefficients:
## ma1 ma2 sma1 sma2
## -0.3890 -0.2888 -0.8128 0.0452
## s.e. 0.0499 0.0497 0.0909 0.0899
##
## sigma^2 = 1262: log likelihood = -1735.08
## AIC=3480.16 AICc=3480.33 BIC=3499.4
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.032989 34.67153 19.34546 -1.549093 9.099475 0.6810178
## ACF1
## Training set 0.02453064
#4. fit auto.arima as an automatic benchmark (it selects a non-seasonal ARIMA(2,1,2))
mod11B=auto.arima(blackHC$hate_crime)
summary(mod11B) #(2,1,2), AIC=3664.08 AICc=3664.25 BIC=3683.49
## Series: blackHC$hate_crime
## ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.5493 -0.7173 -1.8274 0.8925
## s.e. 0.0742 0.0549 0.0705 0.0668
##
## sigma^2 = 1553: log likelihood = -1827.04
## AIC=3664.08 AICc=3664.25 BIC=3683.49
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.761113 39.12775 25.68421 -1.255502 12.53004 0.9041606
## ACF1
## Training set -0.03501769
# Residual stationarity checks for the auto.arima model: all four tests pass,
# but ACF/PACF (below) still show seasonal structure it failed to capture.
tsplot(mod11B$residuals) #fairly okay
adf.test(mod11B$residuals) #stationary
## Warning in adf.test(mod11B$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod11B$residuals
## Dickey-Fuller = -6.995, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod11B$residuals) #stationary
## Warning in pp.test(mod11B$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod11B$residuals
## Dickey-Fuller Z(alpha) = -377.87, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod11B$residuals) #stationary
## Warning in kpss.test(mod11B$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod11B$residuals
## KPSS Level = 0.12662, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod11B$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96.39 -18.44 1.32 21.63 461.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.94634 0.12071 -7.840 5.49e-14 ***
## yd.diff.lag1 -0.08654 0.10956 -0.790 0.430
## yd.diff.lag2 -0.10086 0.09547 -1.056 0.291
## yd.diff.lag3 -0.07328 0.07746 -0.946 0.345
## yd.diff.lag4 -0.01334 0.05390 -0.247 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.57 on 350 degrees of freedom
## Multiple R-squared: 0.5154, Adjusted R-squared: 0.5085
## F-statistic: 74.45 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -7.8398
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod11B$residuals, lag=100) #some seasonal pattern remains
pacf(mod11B$residuals, lag=100) #some seasonal lag, not good
#comparing 6 different SARIMA models, our best model is:
mod7B=Arima(blackHC$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift =FALSE)
summary(mod7B) #AIC=3478.41 AICc=3478.53 BIC=3493.81
## Series: blackHC$hate_crime
## ARIMA(0,1,2)(0,1,1)[12]
##
## Coefficients:
## ma1 ma2 sma1
## -0.3883 -0.2895 -0.7777
## s.e. 0.0498 0.0497 0.0571
##
## sigma^2 = 1259: log likelihood = -1735.2
## AIC=3478.41 AICc=3478.53 BIC=3493.81
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -1.088744 34.6865 19.38158 -1.575956 9.117224 0.6822895 0.02488984
# Coefficient significance, fit quality, and residual-normality checks for mod7B.
coeftest(mod7B) #all three coefficients are highly significant
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.388306 0.049826 -7.7932 6.532e-15 ***
## ma2 -0.289509 0.049716 -5.8233 5.769e-09 ***
## sma1 -0.777704 0.057065 -13.6285 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cor(fitted(mod7B), blackHC$hate_crime)^2 #0.6875213 -- squared correlation of fitted vs observed
## [1] 0.6875213
#check normality of residuals
qqPlot(mod7B$residuals) #fairly normal but some outliers (observations 354 and 37 flagged)
## [1] 354 37
densityplot(as.numeric(mod7B$residuals)) #some outliers
shapiro.test(mod7B$residuals) #not normal; we can't trust forecast intervals unless outliers are dropped and it is tested again
##
## Shapiro-Wilk normality test
##
## data: mod7B$residuals
## W = 0.63638, p-value < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: mod7B$residuals
## W = 0.63638, p-value < 2.2e-16
For anti-African American or Black hate crimes, the best model is SARIMA(0,1,2)×(0,1,1)[12]; its residuals also fail the Shapiro–Wilk normality test, so forecast intervals should be interpreted with caution.
# Compare the best function-of-time model against the best SARIMA model for
# anti-Black hate crimes by holding out the final 12 months (year 2020) as a
# test set, refitting both models on months 1-348, and scoring the squared
# correlation between the 12 forecasts and the held-out observations.
t <- 1:360
#best function of time model: seasonal model
mod5B <- lm(hate_crime~t+as.factor(month)+0, data=blackHC)
#summary(mod5B) #Adjusted R-squared: 0.9388
AIC(mod5B) #3902.605
## [1] 3902.605
BIC(mod5B) #3957.01
## [1] 3957.01
#best SARIMA model
mod7B <- Arima(blackHC$hate_crime, order=c(0,1,2), seasonal=list(order=c(0,1,1), period=12), include.drift = FALSE)
#summary(mod7B) #AIC=3478.41 AICc=3478.53 BIC=3493.81
#compare forecasting performance of two models
b <- blackHC[1:348, ] #training set: months 1-348, i.e. Jan 1991 through Dec 2019
t <- 1:348
foftimeModB <- lm(hate_crime~t+as.factor(month)+0, data=b)
AIC(foftimeModB) #3584.569
## [1] 3584.569
BIC(foftimeModB) #3638.5
## [1] 3638.5
#forecast the 12 held-out months (Jan-Dec 2020) from each model
foftimePredB <- predict(foftimeModB, data.frame(t=349:360, month=1:12))
sarimaModB <- Arima(b$hate_crime, order=c(0,1,2), seasonal=list(order=c(0,1,1), period=12), include.drift = FALSE)
#summary(sarimaModB) #AIC=3072.28 AICc=3072.4 BIC=3087.53
sarimaBPred <- forecast(sarimaModB, h=12)$mean
#real data: the held-out final 12 months
real_value2 <- blackHC$hate_crime[349:360]
#compare with real data
cor(real_value2, foftimePredB)^2 # R^2 = 0.2092973
## [1] 0.2092973
cor(real_value2, sarimaBPred)^2 # R^2 = 0.3096983, this has the better prediction
## [1] 0.3096983
The SARIMA model is better: its out-of-sample R² (0.310) exceeds that of the function-of-time model (0.209).
#plot prediction
# 24-month forecast (2021 and 2022) from the full-sample SARIMA model, with
# 80% and 95% prediction intervals.
pred_blackHC=forecast(mod7B,h=24) #two years, 2021 and 2022
pred_blackHC
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 361 225.7696 180.2951 271.2442 156.2223 295.3170
## 362 262.9636 209.6561 316.2712 181.4367 344.4905
## 363 274.7727 219.4884 330.0570 190.2226 359.3227
## 364 279.5197 222.3269 336.7125 192.0509 366.9885
## 365 307.4694 248.4298 366.5090 217.1762 397.7627
## 366 406.3728 345.5424 467.2031 313.3408 499.4048
## 367 367.2499 304.6801 429.8198 271.5576 462.9423
## 368 352.8507 288.5883 417.1130 254.5699 451.1314
## 369 327.7986 261.8872 393.7099 226.9959 428.6013
## 370 326.3496 258.8295 393.8697 223.0865 429.6127
## 371 296.7755 227.6841 365.8669 191.1094 402.4417
## 372 260.7437 190.1159 331.3715 152.7279 368.7595
## 373 270.0399 195.1978 344.8820 155.5787 384.5010
## 374 288.0078 210.3198 365.6958 169.1943 406.8214
## 375 299.8169 220.0916 379.5422 177.8875 421.7463
## 376 304.5639 222.8521 386.2758 179.5964 429.5314
## 377 332.5136 248.8624 416.1649 204.5801 460.4471
## 378 431.4170 345.8703 516.9636 300.5847 562.2493
## 379 392.2941 304.8932 479.6951 258.6259 525.9624
## 380 377.8949 288.6781 467.1116 241.4496 514.3401
## 381 352.8428 261.8464 443.8391 213.6759 492.0097
## 382 351.3938 258.6520 444.1355 209.5575 493.2301
## 383 321.8198 227.3648 416.2747 177.3634 466.2761
## 384 285.7879 189.6503 381.9255 138.7582 432.8177
plot(pred_blackHC, main="Prediction on anti-African American or Black hate crimes", xlab="Time", ylab="Hate crimes")
# Cross-check with astsa::sarima.for — point forecasts match forecast() above.
forecasting3<-sarima.for(blackHC$hate_crime, 0,1,2, 0,1,1, 12, n.ahead = 24)
forecasting3$pred
## Time Series:
## Start = 361
## End = 384
## Frequency = 1
## [1] 225.7696 262.9636 274.7727 279.5197 307.4694 406.3728 367.2499 352.8507
## [9] 327.7986 326.3496 296.7755 260.7437 270.0399 288.0078 299.8169 304.5639
## [17] 332.5136 431.4170 392.2941 377.8949 352.8428 351.3938 321.8198 285.7879
# Anti-Asian hate crime series: summary statistics and time-series objects.
summary(asianHC)
## year month hate_crime difference
## Min. :1991 Min. : 1.00 Min. : 3.00 Min. :-26.00000
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:12.00 1st Qu.: -4.00000
## Median :2006 Median : 6.50 Median :17.00 Median : 0.00000
## Mean :2006 Mean : 6.50 Mean :17.84 Mean : 0.01944
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:23.00 3rd Qu.: 4.00000
## Max. :2020 Max. :12.00 Max. :51.00 Max. : 39.00000
# Build monthly xts objects indexed two ways: POSIXct dates and yearmon.
ts5<- xts(asianHC$hate_crime, as.POSIXct(sprintf("%d-%d-01", asianHC$year, asianHC$month)))
ts6 <- xts(asianHC$hate_crime, as.yearmon(asianHC$year + (asianHC$month-1)/12))
plot(ts6, main="Anti-Asian Hate Crimes")
# Optional: year-by-year plots (12-month windows), kept for reference.
#par(mfrow=1:2)
#for (i in 1:30){
# start=12*(i-1)+1
# end=12*i
# print(plot(ts6[start:end]))
#}
#start with linear model: hate_crime ~ t, t = month index 1..360
t=1:360
mod1A=lm(hate_crime~t, data=asianHC)
summary(mod1A) #Adjusted R-squared: 0.245
##
## Call:
## lm(formula = hate_crime ~ t, data = asianHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.500 -4.766 -1.392 3.695 39.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.008651 0.762691 32.79 <2e-16 ***
## t -0.039691 0.003662 -10.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.22 on 358 degrees of freedom
## Multiple R-squared: 0.2471, Adjusted R-squared: 0.245
## F-statistic: 117.5 on 1 and 358 DF, p-value: < 2.2e-16
#checking stationarity of the residuals
plot(mod1A$residuals) #doesn't look stationary; fit quadratic trend
lines(lowess(mod1A$residuals))
#quadratic model: add t^2 to capture curvature in the trend
mod2A=lm(hate_crime~t+I(t^2), data=asianHC)
summary(mod2A) #Adjusted R-squared: 0.2607
##
## Call:
## lm(formula = hate_crime ~ t + I(t^2), data = asianHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.419 -4.548 -1.182 3.317 37.833
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.750e+01 1.136e+00 24.208 < 2e-16 ***
## t -8.099e-02 1.453e-02 -5.573 4.94e-08 ***
## I(t^2) 1.144e-04 3.898e-05 2.934 0.00356 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.145 on 357 degrees of freedom
## Multiple R-squared: 0.2648, Adjusted R-squared: 0.2607
## F-statistic: 64.3 on 2 and 357 DF, p-value: < 2.2e-16
#tsplot(mod2A$residuals) #didn't change much from the original tsplot
plot(mod2A$residuals) #doesn't look stationary
lines(lowess(mod2A$residuals))
#cosine model: add annual harmonic terms (period 12) to the quadratic trend
asianHC$Xcos=cos(2*pi*t/12)
asianHC$Xsin=sin(2*pi*t/12)
mod3A=lm(hate_crime~t+I(t^2)+Xcos+Xsin,data=asianHC)
summary(mod3A) #Adjusted R-squared: 0.2773 , improved. sin term not sig.
##
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos + Xsin, data = asianHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.742 -4.693 -1.174 3.722 38.063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.750e+01 1.123e+00 24.479 < 2e-16 ***
## t -8.100e-02 1.437e-02 -5.637 3.52e-08 ***
## I(t^2) 1.145e-04 3.854e-05 2.971 0.00317 **
## Xcos -1.666e+00 5.265e-01 -3.164 0.00169 **
## Xsin -2.391e-01 5.267e-01 -0.454 0.65009
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.064 on 355 degrees of freedom
## Multiple R-squared: 0.2854, Adjusted R-squared: 0.2773
## F-statistic: 35.44 on 4 and 355 DF, p-value: < 2.2e-16
tsplot(mod3A$residuals)
plot(mod3A$residuals)
lines(lowess(mod3A$residuals))#better than the two models from above
#cosine model, dropping the non-significant sine term
mod4A=lm(hate_crime~t+I(t^2)+Xcos,data=asianHC)
summary(mod4A) #Adjusted R-squared: 0.2789 got slightly better
##
## Call:
## lm(formula = hate_crime ~ t + I(t^2) + Xcos, data = asianHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.735 -4.699 -1.162 3.694 37.817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.4890270 1.1219142 24.502 < 2e-16 ***
## t -0.0809557 0.0143517 -5.641 3.45e-08 ***
## I(t^2) 0.0001145 0.0000385 2.975 0.00313 **
## Xcos -1.6659519 0.5259498 -3.168 0.00167 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.056 on 356 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.2789
## F-statistic: 47.29 on 3 and 356 DF, p-value: < 2.2e-16
tsplot(mod4A$residuals) #looks like it got rid of the seasonal trend, but still not stationary
plot(mod4A$residuals)
lines(lowess(mod4A$residuals))#didn't get better
#seasonal model: one dummy per month, no intercept, plus linear trend
mod5A=lm(hate_crime~0+t+as.factor(month)+0, data=asianHC)
summary(mod5A) #Adjusted R-squared: 0.8687
##
## Call:
## lm(formula = hate_crime ~ 0 + t + as.factor(month) + 0, data = asianHC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.096 -4.653 -1.517 3.939 38.639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## t -0.039686 0.003619 -10.97 <2e-16 ***
## as.factor(month)1 22.778396 1.447780 15.73 <2e-16 ***
## as.factor(month)2 22.451415 1.449366 15.49 <2e-16 ***
## as.factor(month)3 26.291101 1.450960 18.12 <2e-16 ***
## as.factor(month)4 26.597454 1.452561 18.31 <2e-16 ***
## as.factor(month)5 27.037140 1.454169 18.59 <2e-16 ***
## as.factor(month)6 25.810159 1.455784 17.73 <2e-16 ***
## as.factor(month)7 25.883179 1.457407 17.76 <2e-16 ***
## as.factor(month)8 24.722865 1.459036 16.95 <2e-16 ***
## as.factor(month)9 25.529218 1.460673 17.48 <2e-16 ***
## as.factor(month)10 26.835570 1.462317 18.35 <2e-16 ***
## as.factor(month)11 23.975256 1.463968 16.38 <2e-16 ***
## as.factor(month)12 22.181609 1.465627 15.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.131 on 347 degrees of freedom
## Multiple R-squared: 0.8734, Adjusted R-squared: 0.8687
## F-statistic: 184.2 on 13 and 347 DF, p-value: < 2.2e-16
tsplot(mod5A$residuals)
plot(mod5A$residuals)
lines(lowess(mod5A$residuals))#still not good; rather than detrending the data, we might need differencing to make it stationary
None of the models fixed the issue of non-linearity observed in the residuals. We cannot use function-of-time models for prediction.
# Stationarity battery on the raw anti-Asian series: ADF, PP, KPSS, DF-GLS.
tsplot(asianHC$hate_crime) #doesn't look stationary
adf.test(asianHC$hate_crime) #fail (p > 0.05: cannot reject unit root)
##
## Augmented Dickey-Fuller Test
##
## data: asianHC$hate_crime
## Dickey-Fuller = -3.3697, Lag order = 7, p-value = 0.05946
## alternative hypothesis: stationary
pp.test(asianHC$hate_crime)
## Warning in pp.test(asianHC$hate_crime): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: asianHC$hate_crime
## Dickey-Fuller Z(alpha) = -183.41, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(asianHC$hate_crime) #fail (null here is stationarity; p = 0.01 rejects it)
## Warning in kpss.test(asianHC$hate_crime): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: asianHC$hate_crime
## KPSS Level = 2.9982, Truncation lag parameter = 5, p-value = 0.01
summary(ur.ers(asianHC$hate_crime))
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.292 -3.265 0.156 3.992 36.536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.07523 0.03104 -2.424 0.01586 *
## yd.diff.lag1 -0.50220 0.05770 -8.704 < 2e-16 ***
## yd.diff.lag2 -0.34276 0.06209 -5.521 6.59e-08 ***
## yd.diff.lag3 -0.18076 0.06059 -2.984 0.00305 **
## yd.diff.lag4 -0.09133 0.05309 -1.720 0.08625 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.023 on 350 degrees of freedom
## Multiple R-squared: 0.2575, Adjusted R-squared: 0.2469
## F-statistic: 24.27 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -2.4239
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(asianHC$hate_crime, lag=100) #doesn't look stationary, might need regular differencing
# Re-run the stationarity battery on the first-differenced series; all pass.
tsplot(diff(asianHC$hate_crime)) #looks better but some concern with variance
adf.test(diff(asianHC$hate_crime))
## Warning in adf.test(diff(asianHC$hate_crime)): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: diff(asianHC$hate_crime)
## Dickey-Fuller = -10.455, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(asianHC$hate_crime))
## Warning in pp.test(diff(asianHC$hate_crime)): p-value smaller than printed p-
## value
##
## Phillips-Perron Unit Root Test
##
## data: diff(asianHC$hate_crime)
## Dickey-Fuller Z(alpha) = -407.93, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(asianHC$hate_crime))
## Warning in kpss.test(diff(asianHC$hate_crime)): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: diff(asianHC$hate_crime)
## KPSS Level = 0.028332, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(diff(asianHC$hate_crime))) #pass all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.207 -3.989 -0.429 3.231 36.600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -2.65476 0.20266 -13.099 < 2e-16 ***
## yd.diff.lag1 1.07717 0.17667 6.097 2.86e-09 ***
## yd.diff.lag2 0.64847 0.14000 4.632 5.12e-06 ***
## yd.diff.lag3 0.36672 0.09704 3.779 0.000185 ***
## yd.diff.lag4 0.16308 0.05257 3.102 0.002078 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.988 on 349 degrees of freedom
## Multiple R-squared: 0.7367, Adjusted R-squared: 0.733
## F-statistic: 195.3 on 5 and 349 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -13.0994
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(diff(asianHC$hate_crime), lag=100) #still some structure, but better than above
#asian hate crime: identify SARIMA orders from the differenced series
tsplot(diff(asianHC$hate_crime))
acf(diff(asianHC$hate_crime), lag=50) #lag 1, MA(1)
pacf(diff(asianHC$hate_crime), lag=50) #cut off at 12, SAR(1)
#1. start with SAR(1) included
mod6A=Arima(asianHC$hate_crime,order=c(0,1,0), seasonal=list(order=c(1,0,0),period=12), include.drift = TRUE) #has warning, can't include drift if we do 2 differencing
summary(mod6A) #AIC=2411.86 AICc=2411.93 BIC=2423.51
## Series: asianHC$hate_crime
## ARIMA(0,1,0)(1,0,0)[12] with drift
##
## Coefficients:
## sar1 drift
## 0.0920 0.0210
## s.e. 0.0557 0.3998
##
## sigma^2 = 47.89: log likelihood = -1202.93
## AIC=2411.86 AICc=2411.93 BIC=2423.51
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.002045848 6.891671 5.126097 -8.685741 33.34394 0.9909902
## ACF1
## Training set -0.3949622
coeftest(mod6A) #drift not sig. -> refit without drift below
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## sar1 0.092017 0.055728 1.6512 0.09871 .
## drift 0.021049 0.399796 0.0526 0.95801
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same model without the (non-significant) drift term; lower AIC/BIC.
mod6.2A=Arima(asianHC$hate_crime,order=c(0,1,0),seasonal=list(order=c(1,0,0),period=12), include.drift =FALSE)
summary(mod6.2A) #AIC=2409.87 AICc=2409.9 BIC=2417.63 #better
## Series: asianHC$hate_crime
## ARIMA(0,1,0)(1,0,0)[12]
##
## Coefficients:
## sar1
## 0.0920
## s.e. 0.0557
##
## sigma^2 = 47.76: log likelihood = -1202.93
## AIC=2409.87 AICc=2409.9 BIC=2417.63
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01707476 6.891697 5.125928 -8.55184 33.31935 0.9909575
## ACF1
## Training set -0.3949598
# Residual diagnostics: all four unit-root/stationarity tests pass.
tsplot(mod6.2A$residuals) #fairly okay, outlier
adf.test(mod6.2A$residuals) #stationary
## Warning in adf.test(mod6.2A$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod6.2A$residuals
## Dickey-Fuller = -10.103, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod6.2A$residuals) #stationary
## Warning in pp.test(mod6.2A$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod6.2A$residuals
## Dickey-Fuller Z(alpha) = -412.22, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod6.2A$residuals) #stationary
## Warning in kpss.test(mod6.2A$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod6.2A$residuals
## KPSS Level = 0.024991, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod6.2A$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.844 -3.898 -0.273 3.143 36.224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -2.60904 0.20230 -12.897 < 2e-16 ***
## yd.diff.lag1 1.03044 0.17641 5.841 1.19e-08 ***
## yd.diff.lag2 0.61514 0.14000 4.394 1.48e-05 ***
## yd.diff.lag3 0.34612 0.09728 3.558 0.000425 ***
## yd.diff.lag4 0.15670 0.05264 2.977 0.003117 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.971 on 350 degrees of freedom
## Multiple R-squared: 0.7371, Adjusted R-squared: 0.7333
## F-statistic: 196.3 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -12.8966
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod6.2A$residuals, lag=50) #lag at 1, MA(1) more obvious
pacf(mod6.2A$residuals, lag=50) #lags 1, 2, 3, 5 significant; try AR(1)
#2. add MA(1), suggested by the residual ACF of mod6.2A
mod7A=Arima(asianHC$hate_crime,order=c(0,1,1), seasonal=list(order=c(1,0,0),period=12), include.drift =FALSE)
summary(mod7A) #AIC=2301.27 AICc=2301.34 BIC=2312.92 #better
## Series: asianHC$hate_crime
## ARIMA(0,1,1)(1,0,0)[12]
##
## Coefficients:
## ma1 sar1
## -0.7400 0.1081
## s.e. 0.0615 0.0603
##
## sigma^2 = 35.11: log likelihood = -1147.63
## AIC=2301.27 AICc=2301.34 BIC=2312.92
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.105185 5.901044 4.425677 -9.26446 28.25707 0.8555833 0.1232071
# Residual diagnostics: all four unit-root/stationarity tests pass.
tsplot(mod7A$residuals) #fairly okay
adf.test(mod7A$residuals) #stationary
## Warning in adf.test(mod7A$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod7A$residuals
## Dickey-Fuller = -8.3069, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod7A$residuals) #stationary
## Warning in pp.test(mod7A$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod7A$residuals
## Dickey-Fuller Z(alpha) = -297.59, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod7A$residuals) #stationary
## Warning in kpss.test(mod7A$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod7A$residuals
## KPSS Level = 0.072445, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod7A$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.627 -3.459 -0.044 3.320 36.058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -1.05619 0.11235 -9.401 <2e-16 ***
## yd.diff.lag1 0.17189 0.09949 1.728 0.0849 .
## yd.diff.lag2 0.14471 0.08643 1.674 0.0950 .
## yd.diff.lag3 0.13784 0.07145 1.929 0.0545 .
## yd.diff.lag4 0.09184 0.05367 1.711 0.0879 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.87 on 350 degrees of freedom
## Multiple R-squared: 0.4462, Adjusted R-squared: 0.4383
## F-statistic: 56.39 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -9.4008
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod7A$residuals, lag=50) #small lag at 1.
pacf(mod7A$residuals, lag=50) #lag 1 sig. AR(1)
#3. add AR(1), suggested by the residual PACF of mod7A
mod8A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,0),period=12), include.drift =FALSE)
summary(mod8A) #AIC=2289.04 AICc=2289.16 BIC=2304.58 #better
## Series: asianHC$hate_crime
## ARIMA(1,1,1)(1,0,0)[12]
##
## Coefficients:
## ar1 ma1 sar1
## 0.2633 -0.8919 0.1089
## s.e. 0.0634 0.0310 0.0596
##
## sigma^2 = 33.82: log likelihood = -1140.52
## AIC=2289.04 AICc=2289.16 BIC=2304.58
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.1467428 5.783 4.32512 -9.216289 27.78039 0.8361433 -0.01144035
# Residual diagnostics: all four unit-root/stationarity tests pass.
tsplot(mod8A$residuals) #fairly okay
adf.test(mod8A$residuals) #stationary
## Warning in adf.test(mod8A$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod8A$residuals
## Dickey-Fuller = -7.0533, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod8A$residuals) #stationary
## Warning in pp.test(mod8A$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod8A$residuals
## Dickey-Fuller Z(alpha) = -383.84, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod8A$residuals) #stationary
## Warning in kpss.test(mod8A$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod8A$residuals
## KPSS Level = 0.18526, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod8A$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.280 -3.612 -0.159 3.336 36.297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.93726 0.11441 -8.192 4.87e-15 ***
## yd.diff.lag1 -0.08013 0.10400 -0.770 0.442
## yd.diff.lag2 -0.03269 0.09208 -0.355 0.723
## yd.diff.lag3 0.03389 0.07614 0.445 0.656
## yd.diff.lag4 0.05033 0.05340 0.943 0.347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.809 on 350 degrees of freedom
## Multiple R-squared: 0.5114, Adjusted R-squared: 0.5044
## F-statistic: 73.26 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -8.1922
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod8A$residuals, lag=50) #looks okay
pacf(mod8A$residuals, lag=50) #looks okay
#4. auto arima as a benchmark: its chosen ARIMA(2,1,2) underperforms mod8A
mod9A=auto.arima(asianHC$hate_crime)
summary(mod9A) #(2,1,2), AIC=2293.57 AICc=2293.74 BIC=2312.99 not better
## Series: asianHC$hate_crime
## ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 0.5687 -0.0288 -1.2067 0.2743
## s.e. 0.5477 0.1608 0.5446 0.4839
##
## sigma^2 = 34.17: log likelihood = -1141.78
## AIC=2293.57 AICc=2293.74 BIC=2312.99
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1325418 5.804518 4.353629 -9.451927 28.04619 0.8416546
## ACF1
## Training set 0.001352735
tsplot(mod9A$residuals)
acf(mod9A$residuals)
pacf(mod9A$residuals) #looks okay but AIC and BIC are better in mod8A
#5. add extra term and see if the model gets better
#add SMA(1)
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
coeftest(mod10A) #all terms sig.
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.237805 0.066895 3.5549 0.0003781 ***
## ma1 -0.879696 0.034709 -25.3447 < 2.2e-16 ***
## sar1 0.910623 0.099283 9.1720 < 2.2e-16 ***
## sma1 -0.837127 0.131801 -6.3514 2.133e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod10A) #AIC=2285.7 AICc=2285.87 BIC=2305.12 #AIC got better, BIC got worse; since we aim to do prediction, select the model with better AIC.
## Series: asianHC$hate_crime
## ARIMA(1,1,1)(1,0,1)[12]
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.2378 -0.8797 0.9106 -0.8371
## s.e. 0.0669 0.0347 0.0993 0.1318
##
## sigma^2 = 33.32: log likelihood = -1137.85
## AIC=2285.7 AICc=2285.87 BIC=2305.12
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1810031 5.731914 4.269975 -8.476687 27.25652 0.8254825
## ACF1
## Training set -0.009022443
# Residual diagnostics: all four unit-root/stationarity tests pass.
tsplot(mod10A$residuals) #fairly okay
adf.test(mod10A$residuals) #stationary
## Warning in adf.test(mod10A$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: mod10A$residuals
## Dickey-Fuller = -7.0277, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod10A$residuals) #stationary
## Warning in pp.test(mod10A$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: mod10A$residuals
## Dickey-Fuller Z(alpha) = -380.79, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod10A$residuals) #stationary
## Warning in kpss.test(mod10A$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: mod10A$residuals
## KPSS Level = 0.18343, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(mod10A$residuals)) #stationary
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.553 -3.533 -0.251 3.305 36.495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.93549 0.11487 -8.144 6.81e-15 ***
## yd.diff.lag1 -0.07901 0.10446 -0.756 0.450
## yd.diff.lag2 -0.04352 0.09237 -0.471 0.638
## yd.diff.lag3 0.02606 0.07612 0.342 0.732
## yd.diff.lag4 0.04763 0.05344 0.891 0.373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.759 on 350 degrees of freedom
## Multiple R-squared: 0.5097, Adjusted R-squared: 0.5027
## F-statistic: 72.77 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -8.1442
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(mod10A$residuals, lag=50) #looks okay
pacf(mod10A$residuals, lag=50) #looks okay
#comparing 6 different SARIMA models, our best model is:
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
summary(mod10A) #AIC=2285.7 AICc=2285.87 BIC=2305.12
## Series: asianHC$hate_crime
## ARIMA(1,1,1)(1,0,1)[12]
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.2378 -0.8797 0.9106 -0.8371
## s.e. 0.0669 0.0347 0.0993 0.1318
##
## sigma^2 = 33.32: log likelihood = -1137.85
## AIC=2285.7 AICc=2285.87 BIC=2305.12
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1810031 5.731914 4.269975 -8.476687 27.25652 0.8254825
## ACF1
## Training set -0.009022443
coeftest(mod10A)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.237805 0.066895 3.5549 0.0003781 ***
## ma1 -0.879696 0.034709 -25.3447 < 2.2e-16 ***
## sar1 0.910623 0.099283 9.1720 < 2.2e-16 ***
## sma1 -0.837127 0.131801 -6.3514 2.133e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pseudo R^2: squared correlation between fitted values and the observed series.
cor(fitted(mod10A), asianHC$hate_crime)^2 #0.52753
## [1] 0.52753
#check normality of the residuals (needed to trust the forecast intervals)
qqPlot(mod10A$residuals) #fairly normal only several outliers
## [1] 351 352
densityplot(as.numeric(mod10A$residuals)) #some outliers
shapiro.test(mod10A$residuals) #not normal; we can't trust the forecast intervals unless we drop the outliers and re-test
##
## Shapiro-Wilk normality test
##
## data: mod10A$residuals
## W = 0.94955, p-value = 9.472e-10
The best model is SARIMA(1,1,1)x(1,0,1)[12].
#best function of time model: seasonal model
# Out-of-sample comparison for the anti-Asian series, on two holdout windows.
t=1:360
mod5A=lm(hate_crime~0+t+as.factor(month), data=asianHC)
#summary(mod5A) #Adjusted R-squared: 0.8716
AIC(mod5A) #2450.807 but residuals were not linear
## [1] 2450.807
BIC(mod5A) #2505.212
## [1] 2505.212
#best SARIMA model for the diff data:
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
#summary(mod10A) #AIC=2285.7 AICc=2285.87 BIC=2305.12
#compare forecasting performance of two models
c=asianHC[1:348,] #training subset: holds out the last 12 months
t=1:348
foftimeModA = lm(hate_crime~0+t+I(t^2)+as.factor(month), data=c)
AIC(foftimeModA) #2260.843
## [1] 2260.843
BIC(foftimeModA) #2318.626
## [1] 2318.626
foftimePredA= predict(foftimeModA, data.frame(t=349:360, month=1:12))
sarimaModA=Arima(c$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift = FALSE)
#summary(sarimaModA) #AIC=2148.5 AICc=2148.67 BIC=2167.74
sarimaAPred=forecast(sarimaModA,h=12)$mean
#real data for the held-out 12 months
real_value3=asianHC$hate_crime[c(349:360)]
#compare with real data
cor(real_value3,foftimePredA)^2 # R^2 = 0.3780229
## [1] 0.3780229
cor(real_value3,sarimaAPred)^2 # R^2 =0.008795079, might be due to surge in 2020?
## [1] 0.008795079
#compare forecasting performance of two models using a different, earlier holdout
c2=asianHC[1:324,] #including only through 2017
t=1:324
foftimeModA2 = lm(hate_crime~0+t+I(t^2)+as.factor(month), data=c2)
AIC(foftimeModA2) #2089.95
## [1] 2089.95
BIC(foftimeModA2) #2146.661
## [1] 2146.661
foftimePredA2= predict(foftimeModA2, data.frame(t=325:348, month=c(1:12, 1:12))) #predict 2018,2019
sarimaModA2=Arima(c2$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift = FALSE)
#summary(sarimaModA2) #AIC=2013.28 AICc=2013.47 BIC=2032.16
sarimaAPred2=forecast(sarimaModA2,h=24)$mean
sarimaAPred2
## Time Series:
## Start = 325
## End = 348
## Frequency = 1
## [1] 10.824660 11.874799 10.727771 10.601041 11.583699 11.731505 10.057739
## [8] 10.976842 9.944127 10.199161 10.890517 10.432550 10.139316 11.586934
## [15] 10.642463 10.541123 11.417726 11.549579 10.059426 10.877730 9.958281
## [22] 10.185344 10.800873 10.393135
#p=data.frame(sarimaAPred2)
#real data for the held-out 24 months
real_value4=asianHC$hate_crime[c(325:348)]
#compare with real data
cor(real_value4,foftimePredA2)^2 # R^2 = 0.0540556
## [1] 0.0540556
cor(real_value4,sarimaAPred2)^2 # R^2 = 0.1645694
## [1] 0.1645694
On the earlier holdout window, the SARIMA model is again better.
# Plot 24-month-ahead forecasts from mod10A (presumably fit on the full
# 1991-2020 series, so t = 361:384 covers 2021-2022 -- confirm against
# where mod10A was estimated above).
pred_asianHC <- forecast(mod10A, h = 24)
pred_asianHC
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 361 20.08305 12.68576 27.48035 8.769866 31.39624
## 362 21.98624 14.12892 29.84355 9.969509 34.00296
## 363 24.92636 16.92340 32.92933 12.686888 37.16584
## 364 25.14045 17.04025 33.24066 12.752258 37.52865
## 365 24.52689 16.34011 32.71368 12.006283 37.04750
## 366 24.51231 16.24201 32.78261 11.863982 37.16064
## 367 23.29950 14.94703 31.65197 10.525506 36.07349
## 368 22.74114 14.30742 31.17485 9.842882 35.63939
## 369 22.66157 14.14741 31.17573 9.640281 35.68285
## 370 22.86413 14.27029 31.45798 9.720978 36.00729
## 371 22.74483 14.07203 31.41763 9.480922 36.00873
## 372 22.65195 13.90091 31.40299 9.268392 36.03552
## 373 22.26226 13.34548 31.17905 8.625213 35.89931
## 374 23.01848 13.99824 32.03873 9.223204 36.81376
## 375 25.46353 16.35302 34.57404 11.530202 39.39686
## 376 25.60324 16.40607 34.80041 11.537384 39.66910
## 377 25.03138 15.74900 34.31376 10.835207 39.22756
## 378 25.01498 15.64831 34.38164 10.689898 39.34006
## 379 23.90982 14.45966 33.35999 9.457041 38.36260
## 380 23.40119 13.86826 32.93411 8.821835 37.98054
## 381 23.32869 13.71372 32.94366 8.623858 38.03352
## 382 23.51314 13.81682 33.20946 8.683894 38.34239
## 383 23.40449 13.62750 33.18149 8.451867 38.35712
## 384 23.31992 13.46291 33.17693 8.244924 38.39492
plot(pred_asianHC, main = "Prediction on anti-Asian hate crimes",
     xlab = "Time", ylab = "Hate crimes")
# Same SARIMA(1,1,1)x(1,0,1)[12] forecast via astsa::sarima.for, which
# draws its own forecast plot. Arguments are spelled out by name; this is
# the same positional match as the original call (n.ahead was named, so
# the remaining positions filled p, d, q, P, D, Q, S).
forecasting4 <- sarima.for(asianHC$hate_crime, n.ahead = 24,
                           p = 1, d = 1, q = 1, P = 1, D = 0, Q = 1, S = 12)
# Point forecasts agree closely with the forecast() results above.
forecasting4$pred
## Time Series:
## Start = 361
## End = 384
## Frequency = 1
## [1] 20.17948 22.12497 25.11162 25.34448 24.73837 24.73971 23.53464 22.98840
## [9] 22.92469 23.14388 23.04077 22.96991 22.59723 23.37714 25.86200 26.02047
## [17] 25.45813 25.45884 24.36393 23.86925 23.81388 24.01604 23.92484 23.86301
# Stationarity checks on the first difference of the log series:
# differencing removes the trend and the log transform damps the growing
# variance visible in the raw counts.
tsplot(diff(log(asianHC$hate_crime))) #might fix the issue with variance
# Augmented Dickey-Fuller test (H0: unit root / non-stationary).
# p-value reported as 0.01 (smaller than printed), so H0 is rejected.
adf.test(diff(log(asianHC$hate_crime)))
## Warning in adf.test(diff(log(asianHC$hate_crime))): p-value smaller than printed
## p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(log(asianHC$hate_crime))
## Dickey-Fuller = -10.644, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron test (same H0: unit root); also rejected at p = 0.01.
pp.test(diff(log(asianHC$hate_crime)))
## Warning in pp.test(diff(log(asianHC$hate_crime))): p-value smaller than printed
## p-value
##
## Phillips-Perron Unit Root Test
##
## data: diff(log(asianHC$hate_crime))
## Dickey-Fuller Z(alpha) = -419.79, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# KPSS test has the REVERSED null (H0: level stationarity). p > 0.1 fails
# to reject, which is consistent with the series being stationary.
kpss.test(diff(log(asianHC$hate_crime)))
## Warning in kpss.test(diff(log(asianHC$hate_crime))): p-value greater than
## printed p-value
##
## KPSS Test for Level Stationarity
##
## data: diff(log(asianHC$hate_crime))
## KPSS Level = 0.033609, Truncation lag parameter = 5, p-value = 0.1
# Elliott-Rothenberg-Stock DF-GLS test (urca::ur.ers): the test statistic
# (-13.60) is far below the 1% critical value (-2.57), again rejecting a
# unit root. All four tests agree the differenced log series is stationary.
summary(ur.ers(diff(log(asianHC$hate_crime)))) #pass all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.39007 -0.21611 0.01356 0.22069 1.25530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -2.90384 0.21353 -13.599 < 2e-16 ***
## yd.diff.lag1 1.23575 0.18625 6.635 1.24e-10 ***
## yd.diff.lag2 0.72880 0.14733 4.947 1.18e-06 ***
## yd.diff.lag3 0.43078 0.09986 4.314 2.09e-05 ***
## yd.diff.lag4 0.16984 0.05229 3.248 0.00127 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3372 on 349 degrees of freedom
## Multiple R-squared: 0.7667, Adjusted R-squared: 0.7634
## F-statistic: 229.4 on 5 and 349 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -13.5994
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# Sample ACF of the (now stationary) series to guide ARMA order selection.
acf(diff(log(asianHC$hate_crime)))